player_PGstats_2021.csv – NBA players statistics per game in
2021-2022 season
source: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html
player_Adstats_2021.csv – NBA players advance statistics in 2021-2022
season source: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html
salary2022.csv – NBA players contract in 2022-2023 season onward
source: https://www.basketball-reference.com/contracts/players.html
bio.csv – NBA player bio (height and weight)
source: https://www.nba.com/stats/players/bio/?Season=2021-22&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=1
# Attach all packages used in this analysis.
# NOTE: attachment order matters for masked generics — plyr is loaded
# before dplyr (the recommended order), and MASS (attached after dplyr)
# masks dplyr::select, which is why dplyr::select() is called with an
# explicit namespace throughout this document.
library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
library(forcats)
library(ggExtra)
library(reshape2)
library(tree)
library(MASS)
library(Metrics)
library(neuralnet)
library(sigmoid)
# knitr chunk defaults: echo code, cache chunk results, tidy code to an
# 80-character width, and use a 9 x 6 inch default figure size.
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 80), tidy = TRUE, fig.height = 6, fig.width = 9)
# Load the four raw tables described above: per-game stats, advanced
# stats, 2022-23 contracts, and player bios (height/weight).
pgstats <- read.csv("files/2022/player_PGstats_2021.csv")
adstats <- read.csv("files/2022/player_Adstats_2021.csv")
salary <- read.csv("files/2022/salary2022.csv")
bio <- read.csv("files/2022/bio.csv")
NBA players statistics per game in 2021-2022 season
source: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html
dim(pgstats)
## [1] 812 31
str(pgstats)
## 'data.frame': 812 obs. of 31 variables:
## $ Rk : int 1 2 3 4 5 6 6 6 7 8 ...
## $ Player : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Pos : chr "C" "C" "C" "PF" ...
## $ Age : int 22 28 24 21 36 23 23 23 26 23 ...
## $ Tm : chr "TOR" "MEM" "MIA" "MEM" ...
## $ G : int 73 76 56 32 47 65 50 15 66 56 ...
## $ GS : int 28 75 56 0 12 21 19 2 61 56 ...
## $ MP : num 23.6 26.3 32.6 11.3 22.3 22.6 26.3 9.9 27.3 32.3 ...
## $ FG : num 3.6 2.8 7.3 1.7 5.4 3.9 4.7 1.1 3.9 6.6 ...
## $ FGA : num 8.3 5.1 13 4.1 9.7 10.5 12.6 3.2 8.6 9.7 ...
## $ FG. : num 0.439 0.547 0.557 0.402 0.55 0.372 0.375 0.333 0.448 0.677 ...
## $ X3P : num 0.8 0 0 0.2 0.3 1.6 1.9 0.7 2.4 0 ...
## $ X3PA : num 2.1 0 0.1 1.5 1 5.2 6.1 2.2 5.9 0.2 ...
## $ X3P. : num 0.359 0 0 0.125 0.304 0.311 0.311 0.303 0.409 0.1 ...
## $ X2P : num 2.9 2.8 7.3 1.5 5.1 2.3 2.8 0.4 1.5 6.6 ...
## $ X2PA : num 6.1 5 12.9 2.6 8.8 5.3 6.5 1 2.7 9.6 ...
## $ X2P. : num 0.468 0.548 0.562 0.56 0.578 0.433 0.434 0.4 0.533 0.688 ...
## $ eFG. : num 0.486 0.547 0.557 0.424 0.566 0.449 0.45 0.438 0.588 0.678 ...
## $ FT : num 1.1 1.4 4.6 0.6 1.9 1.2 1.4 0.7 1 2.9 ...
## $ FTA : num 1.8 2.6 6.1 1 2.2 1.7 1.9 0.8 1.1 4.2 ...
## $ FT. : num 0.595 0.543 0.753 0.625 0.873 0.743 0.722 0.917 0.865 0.708 ...
## $ ORB : num 2 4.6 2.4 1 1.6 0.6 0.7 0.1 0.5 3.4 ...
## $ DRB : num 4.5 5.4 7.6 1.7 3.9 2.3 2.6 1.5 2.9 7.3 ...
## $ TRB : num 6.5 10 10.1 2.7 5.5 2.9 3.3 1.5 3.4 10.8 ...
## $ AST : num 1.1 3.4 3.4 0.7 0.9 2.4 2.8 1.1 1.5 1.6 ...
## $ STL : num 0.5 0.9 1.4 0.2 0.3 0.7 0.8 0.3 0.7 0.8 ...
## $ BLK : num 0.6 0.8 0.8 0.3 1 0.4 0.4 0.3 0.3 1.3 ...
## $ TOV : num 1.2 1.5 2.6 0.5 0.9 1.4 1.7 0.5 0.7 1.7 ...
## $ PF : num 2.1 2 3.1 1.1 1.7 1.6 1.8 1 1.5 1.7 ...
## $ PTS : num 9.1 6.9 19.1 4.1 12.9 10.6 12.8 3.5 11.1 16.1 ...
## $ player_id: chr "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
player_Adstats_2021.csv – NBA players advance statistics in 2021-2022 season source: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html
dim(adstats)
## [1] 812 30
str(adstats)
## 'data.frame': 812 obs. of 30 variables:
## $ Rk : int 1 2 3 4 5 6 6 6 7 8 ...
## $ Player : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Pos : chr "C" "C" "C" "PF" ...
## $ Age : int 22 28 24 21 36 23 23 23 26 23 ...
## $ Tm : chr "TOR" "MEM" "MIA" "MEM" ...
## $ G : int 73 76 56 32 47 65 50 15 66 56 ...
## $ MP : int 1725 1999 1825 360 1050 1466 1317 149 1805 1809 ...
## $ PER : num 12.7 17.6 21.8 10.2 19.6 10.5 10.5 10.2 12.7 23 ...
## $ TS. : num 0.503 0.56 0.608 0.452 0.604 0.475 0.474 0.497 0.609 0.698 ...
## $ X3PAr : num 0.259 0.003 0.008 0.364 0.1 0.497 0.483 0.688 0.684 0.018 ...
## $ FTr : num 0.217 0.518 0.466 0.242 0.223 0.16 0.153 0.25 0.13 0.428 ...
## $ ORB. : num 8.7 17.9 8.7 9.4 7.8 2.7 3 0.8 1.9 12 ...
## $ DRB. : num 21.7 22 26.1 16.1 18.7 11.5 11 15.6 10.9 24.5 ...
## $ TRB. : num 14.9 19.9 17.5 12.6 13.4 7.1 6.9 8.5 6.5 18.4 ...
## $ AST. : num 6.9 16.1 17.5 7.7 6.3 16.1 16.1 15.5 7.6 8.2 ...
## $ STL. : num 1.1 1.6 2.2 0.8 0.6 1.5 1.5 1.7 1.2 1.2 ...
## $ BLK. : num 2.3 2.7 2.6 2.5 4 1.5 1.4 2.4 1 3.7 ...
## $ TOV. : num 11.3 19.6 14.4 9.9 8 11.3 11.2 13.1 6.7 12.7 ...
## $ USG. : num 18.5 12 25 18.4 22.4 24.1 24.8 17.9 15.2 18.1 ...
## $ X : logi NA NA NA NA NA NA ...
## $ OWS : num 0.4 3.8 3.6 -0.1 2.1 -1.1 -1.1 0 2.8 5.4 ...
## $ DWS : num 2.1 3 3.5 0.4 1 1.1 0.9 0.2 1.4 3 ...
## $ WS : num 2.5 6.8 7.2 0.3 3.1 0.1 -0.1 0.2 4.2 8.5 ...
## $ WS.48 : num 0.07 0.163 0.188 0.044 0.141 0.003 -0.005 0.07 0.11 0.225 ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ OBPM : num -2 1 1.7 -4.2 1.3 -1.8 -1.7 -2.9 0.6 2.7 ...
## $ DBPM : num -0.6 1 2.1 -1.5 -0.6 -1.1 -1.3 1.2 -0.2 1.2 ...
## $ BPM : num -2.6 2 3.8 -5.7 0.7 -2.9 -3 -1.7 0.4 3.9 ...
## $ VORP : num -0.2 2 2.7 -0.3 0.7 -0.3 -0.3 0 1.1 2.7 ...
## $ player_id: chr "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
salary2022.csv – NBA players contract in 2022-2023 season
onward
source: https://www.basketball-reference.com/contracts/players.html
dim(salary)
## [1] 448 12
str(salary)
## 'data.frame': 448 obs. of 12 variables:
## $ Rk : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Player : chr "Stephen Curry" "Russell Westbrook" "LeBron James" "Kevin Durant" ...
## $ Tm : chr "GSW" "LAL" "LAL" "BRK" ...
## $ X2022.23 : chr "$48070014" "$47063478" "$44474988" "$44119845" ...
## $ X2023.24 : chr "$51915615" "" "" "$46407433" ...
## $ X2024.25 : chr "$55761216" "" "" "$49856021" ...
## $ X2025.26 : chr "$59606817" "" "" "$53282609" ...
## $ X2026.27 : chr "" "" "" "" ...
## $ X2027.28 : chr "" "" "" "" ...
## $ Signed.Using: chr "Bird" "Bird Rights" "Bird" "Bird" ...
## $ Guaranteed : chr "$215353662" "$47063478" "$44474988" "$193665908" ...
## $ player_id : chr "curryst01" "westbru01" "jamesle01" "duranke01" ...
bio.csv – NBA player bio (height and weight)
source: https://www.nba.com/stats/players/bio/?Season=2021-22&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=1
dim(bio)
## [1] 605 4
str(bio)
## 'data.frame': 605 obs. of 4 variables:
## $ Player: chr "Aaron Gordon" "Aaron Henry" "Aaron Holiday" "Aaron Nesmith" ...
## $ Team : chr "DEN" "PHI" "PHX" "BOS" ...
## $ Weight: int 235 210 185 215 190 225 200 241 174 240 ...
## $ Height: chr "6-8" "6-5" "6-0" "6-5" ...
I will merge the tables by their primary key (pgstats.player_id) and foreign key (salary.player_id) by inner join (only take the entries which exist). I will treat the players that received salary but have not played any game as outliers.
merged <- merge(pgstats, adstats, by = c("player_id", "Tm"))
Since some players changed teams in the middle of the season, I will use their season-total stats (indicated by Tm = "TOT").
# Players appearing more than once were traded mid-season: keep only their
# season-total ("TOT") row, relabelled with the last team they played for.
# BUG FIX: the original iterated with 1:length(repeated), which runs the
# loop body with bad indices (1, 0) when `repeated` is empty; iterating
# over the vector itself is safe.
repeated <- names(which(table(merged$player_id) > 1))
for (id in repeated) {
    # The season-total row for this player.
    use <- merged[which(merged$player_id == id & merged$Tm == "TOT"), ]
    # Team on the player's last row — presumably the team he finished the
    # season with (row order comes from basketball-reference); confirm.
    temp_tm <- merged$Tm[max(which(merged$player_id == id))]
    # Drop all of the player's rows, then re-append the TOT row carrying
    # the final team label.
    merged <- merged[merged$player_id != id, ]
    use$Tm <- temp_tm
    merged <- rbind(merged, use)
}
I will also rename some variable to increase readability
# Drop the duplicated ".y" columns produced by the join (plus the two
# empty placeholder columns X / X.1 from the advanced-stats CSV) and give
# the survivors readable names. read.csv turned "%" into "." in column
# names ("FG." etc.), so those become explicit "...pct" names here.
merged <- merged %>%
dplyr::select(!c(Rk.y, Player.y, Pos.y, Age.y, MP.y, G.y, X, X.1)) %>%
rename(Rk = Rk.x, Player = Player.x, Position = Pos.x, Age = Age.x, Game_played = G.x,
FGpct = FG., X3Ppct = X3P., X2Ppct = X2P., eFGpct = eFG., FTpct = FT., TSpct = TS.,
ORBpct = ORB., DRBpct = DRB., TRBpct = TRB., ASTpct = AST., STLpct = STL.,
BLKpct = BLK., TOVpct = TOV., USGpct = USG., WSper48 = WS.48, Game_started = GS,
MP = MP.x)
# Collapse the contract table to one row per player, then attach it to the
# stats table (inner join on player id).
# BUG FIX: the original computed
#   Guaranteed = sum(as.numeric(grep("[0-9]+", Guaranteed)))
# but grep() returns match *indices*, not values, so Guaranteed collapsed
# to 0/1 (visible in the str() output further down: "$ Guaranteed : int
# 1 1 1 0 ..."). Parse the dollar amount instead; na.rm handles empty
# strings. The column is dropped before modelling, so downstream results
# are unaffected.
salary <- salary %>%
group_by(Player, player_id, X2022.23) %>%
summarise(Tm = Tm[1], Signed.Using = Signed.Using[1],
    Guaranteed = sum(as.numeric(str_extract(Guaranteed, "[0-9]+")), na.rm = TRUE),
    Rk = Rk[1])
merged <- merge(merged, salary, by = c("player_id"))
# Drop the team column from the bio table and align its player-name column
# with the stats table's key, then left-join so players without a bio row
# are kept (their height/weight become NA).
bio <- rename(dplyr::select(bio, !Team), Player.x = Player)
merged <- merge(merged, bio, by = "Player.x", all.x = TRUE)
# Persist the merged table and read it back. Note that read.csv() adds a
# row-index column "X", which appears in the str() output below and is
# carried along (and excluded) in later steps.
write.csv(merged, "dataset/all2022.csv")
all <- as.data.frame(read.csv("dataset/all2022.csv"))
dim(all)
## [1] 384 60
There are 384 entries.
str(all)
## 'data.frame': 384 obs. of 60 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Player.x : chr "Aaron Gordon" "Aaron Holiday" "Aaron Nesmith" "Aaron Wiggins" ...
## $ player_id : chr "gordoaa01" "holidaa01" "nesmiaa01" "wiggiaa01" ...
## $ Tm.x : chr "DEN" "WAS" "BOS" "OKC" ...
## $ Rk.x : int 198 244 406 581 250 85 448 97 328 497 ...
## $ Position : chr "PF" "PG" "SF" "SG" ...
## $ Age : int 26 25 22 23 35 30 20 27 28 19 ...
## $ Game_played : int 75 63 52 50 69 81 61 41 39 72 ...
## $ Game_started: int 75 15 3 35 69 44 12 18 10 13 ...
## $ MP : num 31.7 16.2 11 24.2 29.1 28.6 20.2 28 15.9 20.7 ...
## $ FG : num 5.8 2.4 1.4 3.1 3.9 3.5 3 2.5 2.4 3.5 ...
## $ FGA : num 11.1 5.4 3.5 6.7 8.2 9 7.5 6.2 4.5 7.3 ...
## $ FGpct : num 0.52 0.447 0.396 0.463 0.467 0.391 0.408 0.398 0.534 0.474 ...
## $ X3P : num 1.2 0.6 0.6 0.8 1.3 1.9 0.9 1 0.2 0.4 ...
## $ X3PA : num 3.5 1.6 2.2 2.8 3.8 4.8 3.2 3.1 0.5 1.6 ...
## $ X3Ppct : num 0.335 0.379 0.27 0.304 0.336 0.404 0.289 0.333 0.286 0.248 ...
## $ X2P : num 4.6 1.8 0.8 2.3 2.6 1.6 2.1 1.5 2.3 3.1 ...
## $ X2PA : num 7.7 3.7 1.3 4 4.4 4.2 4.2 3.2 4 5.7 ...
## $ X2Ppct : num 0.605 0.477 0.612 0.573 0.582 0.378 0.498 0.462 0.568 0.539 ...
## $ eFGpct : num 0.573 0.504 0.481 0.525 0.546 0.499 0.47 0.48 0.551 0.502 ...
## $ FT : num 2.3 0.9 0.4 1.2 1.2 2.7 0.6 1.4 1.1 2.3 ...
## $ FTA : num 3.1 1.1 0.5 1.7 1.4 3.3 0.8 1.8 1.6 3.2 ...
## $ FTpct : num 0.743 0.868 0.808 0.729 0.842 0.822 0.7 0.795 0.651 0.711 ...
## $ ORB : num 1.7 0.4 0.3 1 1.6 0.6 1.2 0.8 1.3 1.9 ...
## $ DRB : num 4.2 1.6 1.4 2.5 6.1 4.3 4 2.8 2.8 3.5 ...
## $ TRB : num 5.9 1.9 1.7 3.6 7.7 4.9 5.2 3.6 4.1 5.5 ...
## $ AST : num 2.5 2.4 0.4 1.4 3.4 3 2.1 4 1.2 2.6 ...
## $ STL : num 0.6 0.7 0.4 0.6 0.7 1 0.6 1.7 0.3 0.8 ...
## $ BLK : num 0.6 0.1 0.1 0.2 1.3 0.3 0.6 0.4 0.6 0.9 ...
## $ TOV : num 1.8 1.1 0.6 1.1 0.9 1.1 1.5 1.4 1.1 2 ...
## $ PF : num 2 1.5 1.3 1.9 1.9 2.7 1.4 2.6 2.6 3 ...
## $ PTS : num 15 6.3 3.8 8.3 10.2 11.7 7.6 7.4 6 9.6 ...
## $ PER : num 15.3 12.6 7.3 10.3 16.7 13.7 12 11.7 13.1 16 ...
## $ TSpct : num 0.602 0.544 0.507 0.556 0.574 0.559 0.485 0.528 0.577 0.552 ...
## $ X3PAr : num 0.312 0.305 0.632 0.409 0.466 0.534 0.432 0.492 0.119 0.223 ...
## $ FTr : num 0.276 0.201 0.143 0.252 0.167 0.363 0.11 0.285 0.358 0.442 ...
## $ ORBpct : num 6.1 2.6 2.9 4.3 6 2.1 6 3.3 9 10.1 ...
## $ DRBpct : num 14.3 10.3 13.6 11 22.2 16.6 20.8 11.2 19.3 18.9 ...
## $ TRBpct : num 10.3 6.5 8.4 7.6 14.3 9.2 13.3 7.3 14.1 14.5 ...
## $ ASTpct : num 11.6 20.7 5.4 8.5 16.4 15.7 16.5 18.5 10.6 19.1 ...
## $ STLpct : num 0.9 2 1.7 1.2 1.2 1.8 1.5 3 1 1.9 ...
## $ BLKpct : num 1.7 0.7 0.8 0.8 4.2 1.1 2.9 1.1 3.5 4.1 ...
## $ TOVpct : num 12.5 15.4 13.8 12.6 9.6 9.7 16.4 16.5 17.4 18.8 ...
## $ USGpct : num 19.7 18.7 17.2 15.3 14.8 17.7 20 13.3 17.1 22 ...
## $ OWS : num 3.2 0.5 -0.4 0.5 3.7 3.2 -1.1 0.7 0.4 0.8 ...
## $ DWS : num 2 0.9 0.9 0.8 3.8 2.9 1.5 1.2 0.5 1.3 ...
## $ WS : num 5.2 1.5 0.4 1.2 7.6 6.1 0.4 2 0.9 2.1 ...
## $ WSper48 : num 0.105 0.068 0.038 0.048 0.181 0.126 0.014 0.082 0.07 0.068 ...
## $ OBPM : num 0.5 -1.9 -4.9 -3.4 1.4 -0.4 -2.7 -2.2 -3.2 -1.6 ...
## $ DBPM : num -1.1 0.3 0.7 -0.9 2.9 1.2 0.1 2.3 0.3 0.6 ...
## $ BPM : num -0.6 -1.7 -4.3 -4.3 4.3 0.8 -2.6 0.2 -2.9 -1 ...
## $ VORP : num 0.9 0.1 -0.3 -0.7 3.2 1.7 -0.2 0.6 -0.1 0.4 ...
## $ Player.y : chr "Aaron Gordon" "Aaron Holiday" "Aaron Nesmith" "Aaron Wiggins" ...
## $ X2022.23 : chr "$19690909" "$1968175" "$3804360" "$1563518" ...
## $ Tm.y : chr "DEN" "ATL" "IND" "OKC" ...
## $ Signed.Using: chr "Bird" "Minimum Salary" "1st Round Pick" "" ...
## $ Guaranteed : int 1 1 1 0 1 1 1 1 1 1 ...
## $ Rk.y : int 63 362 266 430 48 139 281 154 261 278 ...
## $ Weight : int 235 185 215 190 240 214 NA 186 250 NA ...
## $ Height : chr "6-8" "6-0" "6-5" "6-4" ...
sum(all$Player.x != all$Player.y)
## [1] 1
The names agree for all but one entry (a minor naming discrepancy between the two sources), so I will remove Player.y and rename Player.x to name.
# Drop the duplicate name column from the salary table and rename the
# surviving one to `name`.
all <- rename(dplyr::select(all, !Player.y), name = Player.x)
It is rank in their original respective table (alphabetical order of
player name in player statistics tables and salary in 2022-23 season for
salary table).
Since, it doesn’t carry any extra information, I will remove both of the
variables.
# Both rank columns are artifacts of the source tables' orderings; drop them.
all <- dplyr::select(all, !c(Rk.x, Rk.y))
Tm.x is the team the player played for in the 2021-22 season, while Tm.y is the team for the 2022-23 season. I will rename Tm.x to team_2021 and Tm.y to team_2022.
# Clarify which season each team column refers to.
all <- rename(all, team_2021 = Tm.x, team_2022 = Tm.y)
The aim of this project is to predict the salary next year. I will remove the salary of 2023-24 season onward and change the X2022.23 to numeric variables.
# Keep only the 2022-23 salary as the response, and parse strings like
# "$48070014" into numbers (str_extract pulls the first digit run).
all <- rename(all, salary = X2022.23)
all$salary <- as.numeric(str_extract(all$salary, "[0-9]+"))
plot_ly(data = all, x = ~salary, type = "histogram", nbinsx = 30) %>%
layout(title = "Frequency Diagram of NBA salary in 2022-23 season", xaxis = list(title = "yearly salary (USD)"),
yaxis = list(title = "frequency"))
## Warning: Ignoring 1 observations
The salary is highly right skewed and there is high frequency concentrated on 0 to 2 million range. This might be because of the existence of minimum salary in NBA, which is 1 million to 3 million per year depending on their experience (Adams, 2022). I will keep that in mind.
summary(all$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 333333 2196960 5837760 10131574 13955840 48070014 1
I will first use the correlation with salary to get a feel on the numeric variables on the response variables
numVar <- which(sapply(all, is.numeric))
numVarNames <- names(numVar)
length(numVarNames)
## [1] 50
There are 50 numeric variables
# Correlation screen on the numeric columns (dropping the read.csv index
# column X). pairwise.complete.obs tolerates the remaining NAs. Plot the
# 20 variables most correlated with salary.
all_numVar <- all[, numVar]
all_numVar <- dplyr::select(all_numVar, !X)
cor_Mat <- cor(all_numVar, use = "pairwise.complete.obs")
cor_names <- names(sort(cor_Mat[, "salary"], decreasing = TRUE))[1:20]
cor_Mat <- cor_Mat[cor_names, cor_names]
corrplot.mixed(cor_Mat, tl.pos = "lt")
Pts:
Points per game
It has the highest correlation with salary among the numeric variables (0.7944907). It is the average point per game played.
ggplotly(ggplot(all %>%
drop_na(PTS, salary), aes(x = PTS, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + labs(title = "points per game in NBA 2021-22 vs salary in NBA 2022-23",
x = "points per game", y = "yearly salary (USD)"))
There is a clear linear correlation between salary and points per game. The correlation is smaller when the points per game is below about 9 but increase after it goes above 9 points.
ggplotly(ggplot(all %>%
drop_na(PTS, salary), aes(x = PTS)) + geom_histogram(bins = 30) + labs(title = "Frequency Distribution of points per game in NBA 2021-22",
x = "points per game", y = "Frequency"))
Value Over Replacement Player:
VORP - Value Over Replacement Player (available since the 1973-74 season in the NBA); a box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and prorated to an 82-game season.
Although FG, FGA, FT etc are more highly correlated, they are also highly correlated to points per game (> 0.75). I will look at the next one that is not highly correlated to points per game. It has a correlation of (0.6645534) with salary.
ggplotly(ggplot(all %>%
drop_na(VORP, salary), aes(x = VORP, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~ x,
method = "glm", linetype = "dotted", col = "red", se = FALSE) + labs(title = "Value over replacement player in NBA 2021-22 vs salary in NBA 2022-23",
x = "VORP", y = "yearly salary (USD)"))
It shows clear linear correlation except some in both extreme of the VORP.
Assists:
Assists per game
It has a high correlation with salary while not a having such a high correlation with points per game. It has a correlation of (0.6154281) with salary.
ggplotly(ggplot(all %>%
drop_na(AST, salary), aes(x = AST, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~ x,
method = "glm", linetype = "dotted", col = "red", se = FALSE) + labs(title = "Assists per game in NBA 2021-22 vs salary in NBA 2022-23",
x = "Assists per game", y = "yearly salary (USD)"))
It shows a positive correlation until about 6 assists per game, after which the trend turns negative. This may be because players with very high assist numbers are usually not their team's first scoring option, which might explain why they are paid less.
Nacol <- names(which(colSums(is.na(all) | all == "") > 0))
sort(colSums(sapply(all[Nacol], function(x) is.na(x) | x == "")), decreasing = TRUE)
## Signed.Using Weight Height X3Ppct Position FTpct
## 49 17 17 14 8 5
## salary
## 1
kable(all[is.na(all$salary), c("X", "name", "salary")])
| X | name | salary | |
|---|---|---|---|
| 146 | 146 | Ish Wainright | NA |
The salary of Ish Wainright is 125000 USD spotrac (n.d.).
all$salary[all$name == "Ish Wainright"] <- 125000
Signed.Using:
The type of contract use to sign
I will impute by changing all NA to “None”.
unique(all$Signed.Using)
## [1] "Bird" "Minimum Salary" "1st Round Pick"
## [4] "" "Cap Space" "MLE"
## [7] "Bi-Annual Exception" "Mini MLE" "Early Bird"
## [10] "1st Round pick" "Sign and Trade" "Bird Rights"
## [13] "Room Exception" "1st round pick" "Non Bird"
## [16] "Cap space"
# Normalize inconsistent capitalisations of the same contract type
# ("1st Round Pick" / "1st Round pick" / "1st round pick"; "Cap Space" /
# "Cap space"), then treat empty/NA entries as "None".
all$Signed.Using[grep("^1st [Rr]ound [Pp]ick", all$Signed.Using)] <- "1st round pick"
all$Signed.Using[grep("Cap [Ss]pace", all$Signed.Using)] <- "Cap space"
all$Signed.Using[is.na(all$Signed.Using) | all$Signed.Using == ""] <- "None"
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(Signed.Using), salary, .fun = "mean"),
y = salary, fill = Signed.Using)) + geom_boxplot() + geom_point(stat = "summary",
fun = "mean") + labs(title = "Type of contract vs salary", x = "Type of contract",
y = "yearly salary (USD)") + theme(axis.text.x = element_text(angle = 45, hjust = 1)))
kable(all[which(is.na(all$Weight) | is.na(all$Height)), c("name", "Weight", "Height")])
| name | Weight | Height | |
|---|---|---|---|
| 7 | Aleksej Pokusevski | NA | NA |
| 10 | Alperen Şengün | NA | NA |
| 22 | Boban Marjanović | NA | NA |
| 24 | Bogdan Bogdanović | NA | NA |
| 25 | Bojan Bogdanović | NA | NA |
| 29 | Brandon Boston Jr. | NA | NA |
| 75 | Dāvis Bertāns | NA | NA |
| 185 | Jonas Valančiūnas | NA | NA |
| 204 | Jusuf Nurkić | NA | NA |
| 245 | Luka Dončić | NA | NA |
| 253 | Marcus Morris | NA | NA |
| 285 | Nikola Jokić | NA | NA |
| 286 | Nikola Vučević | NA | NA |
| 315 | Robert Williams | NA | NA |
| 346 | Théo Maledon | NA | NA |
| 373 | Vlatko Čančar | NA | NA |
| 378 | Willy Hernangómez | NA | NA |
I will manually search up their height and weight
# Manually researched heights/weights, listed in the same order as the NA
# rows in the table above.
# NOTE(review): this is fragile — the hard-coded vectors must stay in sync
# with the row order returned by which(is.na(...)); a keyed lookup by
# player name would be safer if the data is ever re-pulled.
all$Weight[which(is.na(all$Weight))] <- c(190, 235, 290, 220, 226, 185, 225, 265,
290, 230, 218, 284, 260, 237, 175, 236, 250)
all$Height[which(is.na(all$Height))] <- c("7-0", "6-9", "7-3", "6-6", "6-7", "6-7",
"6-10", "6-11", "6-11", "6-7", "6-8", "6-11", "6-10", "6-8", "6-4", "6-8", "6-11")
# Convert "feet-inches" strings like "6-8" to total inches. Assumes the
# feet part is a single digit, which holds for every height in this data.
heightinch <- as.numeric(sapply(all$Height, function(x) substring(x, 1, 1))) * 12 +
as.numeric(sapply(all$Height, function(x) substring(x, 3, nchar(x))))
all$Height <- heightinch
X3Ppct:
3 point field goal percentage
kable(all[which(is.na(all$X3Ppct)), c("name", "X3P", "X3PA", "X3Ppct")])
| name | X3P | X3PA | X3Ppct | |
|---|---|---|---|---|
| 21 | Bismack Biyombo | 0 | 0 | NA |
| 82 | DeAndre Jordan | 0 | 0 | NA |
| 147 | Ivica Zubac | 0 | 0 | NA |
| 150 | Jaden Springer | 0 | 0 | NA |
| 176 | Jericho Sims | 0 | 0 | NA |
| 269 | Mitchell Robinson | 0 | 0 | NA |
| 273 | Moses Brown | 0 | 0 | NA |
| 280 | Nic Claxton | 0 | 0 | NA |
| 281 | Nick Richards | 0 | 0 | NA |
| 291 | Onyeka Okongwu | 0 | 0 | NA |
| 351 | Tony Bradley | 0 | 0 | NA |
| 364 | Tyrell Terry | 0 | 0 | NA |
| 368 | Udoka Azubuike | 0 | 0 | NA |
| 370 | Vernon Carey Jr. | 0 | 0 | NA |
I will impute by setting to 0 if there is no 3 point attempt.
# 3P% is NA exactly when a player attempted no threes (see table above);
# define it as 0 in that case, and as makes/attempts otherwise for safety.
na_3p <- which(is.na(all$X3Ppct))
all$X3Ppct[na_3p] <- ifelse(all$X3PA[na_3p] == 0, 0, all$X3P[na_3p] / all$X3PA[na_3p])
ggplotly(ggplot(all, aes(x = X3Ppct, y = salary)) + geom_point() + geom_smooth(col = "red",
formula = y ~ x, method = "glm") + labs(title = "3 point percentage vs salary",
x = "3 point field goal percentage", y = "yearly salary (USD)"))
It shows a slight but not significant correlation between salary and 3 point percentage.
ggplotly(ggplot(all, aes(x = X3Ppct, y = salary, col = Position)) + geom_point() +
geom_smooth(formula = y ~ x, method = "glm") + facet_grid(Position ~ .) + labs(title = "3 point percentage vs salary",
x = "3 point field goal percentage", y = "yearly salary (USD)"))
kable(all[is.na(all$Position), c("name", "Position")])
| name | Position | |
|---|---|---|
| 18 | Armoni Brooks | NA |
| 97 | Didi Louzada | NA |
| 99 | Domantas Sabonis | NA |
| 202 | Justin Holiday | NA |
| 237 | Larry Nance Jr. | NA |
| 287 | Norman Powell | NA |
| 331 | Spencer Dinwiddie | NA |
| 365 | Tyrese Haliburton | NA |
I will impute manually by search up their primary position
# Manually researched primary positions, in the same order as the NA rows
# listed above (order-dependent, like the height/weight imputation).
all$Position[is.na(all$Position)] <- c("SG", "SF", "PF", "SG", "PF", "SG", "PG",
"PG")
FTpct:
Free throw percentage
kable(all[which(is.na(all$FTpct)), c("FT", "FTA", "FTpct")])
| FT | FTA | FTpct | |
|---|---|---|---|
| 150 | 0 | 0 | NA |
| 205 | 0 | 0 | NA |
| 251 | 0 | 0 | NA |
| 325 | 0 | 0 | NA |
| 364 | 0 | 0 | NA |
I will impute by setting to 0 if there is no free throw attempt.
# FT% is NA exactly when a player attempted no free throws; define it as 0
# in that case, and as makes/attempts otherwise for safety.
na_ft <- which(is.na(all$FTpct))
all$FTpct[na_ft] <- ifelse(all$FTA[na_ft] == 0, 0, all$FT[na_ft] / all$FTA[na_ft])
ggplotly(ggplot(all, aes(x = FTpct, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = "glm") + labs(title = "Free throw percentage vs salary", x = "free throw percentage",
y = "yearly salary (USD)"))
Guaranteed:
The amount of a player's remaining salary that is guaranteed.
Since it is a direct indication of the salary, I will remove this variable.
all <- dplyr::select(all, !Guaranteed)
Find all character variables:
chrVar <- names(which(sapply(all, is.character)))
chrVar
## [1] "name" "player_id" "team_2021" "team_2022" "Signed.Using"
I will keep them for now to keep track of each entry, but will remove them before fitting the model.
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2021), salary, median, .desc = TRUE),
y = salary, fill = reorder(as.factor(team_2021), salary, .fun = "mean", decreasing = TRUE))) +
geom_boxplot() + labs(title = "Salary for each team", x = "team in 2021-22",
y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + guides(fill = guide_legend(title = "Team")))
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2022), salary, median, .desc = TRUE),
y = salary, fill = reorder(as.factor(team_2022), salary, .fun = "mean", decreasing = TRUE))) +
geom_boxplot() + labs(title = "Salary for each team", x = "team in 2022-23",
y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + guides(fill = guide_legend(title = "Team")))
all$team_2021 <- as.factor(all$team_2021)
all$team_2022 <- as.factor(all$team_2022)
I will also remove this variable as this might be a direct indication to the salary of the player.
all <- dplyr::select(all, !Signed.Using)
Although I have already done some visualisation, I will visualize it again.
summary(all$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 125000 2189160 5823098 10105516 13931408 48070014
ggplotly(ggplot(data = all, aes(x = salary)) + geom_histogram(bins = 50) + labs(title = "Frequency distribution of salary",
x = "yearly salary (USD)", y = "frequency") + theme_minimal())
ggplotly(ggplot(data = all, aes(x = "", y = salary)) + geom_boxplot() + labs(y = "salary") +
coord_flip() + theme_minimal())
# Age vs salary scatter (marginal boxplots are added with ggMarginal below).
# BUG FIX: in the original, labs(title = "Age vs Salary") sat on its own
# line instead of being added to the plot with "+", so the labels object
# was merely printed (see the "$title ... attr(,\"class\")" output below)
# and the plot had no title.
g1 <- ggplot(all, aes(x = Age, y = salary)) + geom_point(alpha = 0.7) + theme_minimal() +
geom_smooth(formula = y ~ x, method = "glm") +
labs(title = "Age vs Salary")
## $title
## [1] "Age vs Salary"
##
## attr(,"class")
## [1] "labels"
ggMarginal(g1, type = "boxplot")
numVarNames <- names(which(sapply(all, is.numeric)))
all_numVar <- all[, numVarNames]
cor_Mat <- cor(all_numVar)
salCor <- cor_Mat["salary", ]
cor_names <- names(sort(salCor[which(abs(salCor) > 0.5)], decreasing = TRUE))
cor_Mat <- cor_Mat[cor_names, cor_names]
corrplot.mixed(cor_Mat, tl.pos = "lt", upper = "square")
# Predictor groupings used for the faceted exploration plots below.
# Player bio.
pbio <- c("Age", "Height", "Weight")
# Playing time and usage.
attendence <- c("Game_played", "Game_started", "MP", "USGpct")
# Overall shooting.
shooting <- c("FG", "FGA", "FGpct", "eFGpct", "PTS", "TSpct")
# Two-point shooting.
X2_point <- c("X2P", "X2PA", "X2Ppct")
# Three-point shooting.
X3_point <- c("X3P", "X3PA", "X3Ppct", "X3PAr")
# Free throws.
Free_throw <- c("FT", "FTA", "FTpct", "FTr")
# Rebounding.
rebounding <- c("ORB", "DRB", "TRB", "ORBpct", "DRBpct", "TRBpct")
# Playmaking (assists and turnovers).
playmaking <- c("AST", "TOV", "ASTpct", "TOVpct")
# Defence.
defence <- c("STL", "BLK", "PF", "STLpct", "BLKpct")
# Overall advanced stats.
overall_adstats <- c("PER", "OWS", "DWS", "WS", "WSper48", "OBPM", "DBPM", "BPM",
"VORP")
all_bio <- melt(all, id.vars = "salary", measure.vars = pbio)
ggplot(all_bio, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_attendence <- melt(all, id.vars = "salary", measure.vars = attendence)
ggplot(all_attendence, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_ovSh <- melt(all, id.vars = "salary", measure.vars = shooting)
ggplot(all_ovSh, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
I will remove FGpct as it is a data that is directly correlated to FG and FGA and it doesn’t have large correlation to salary. I will remove effective field goal percentage as it is similar to true shooting percentage and it has less correlation.
cor(all$FG, (all$FGA * all$FGpct))
## [1] 0.9999042
# NOTE(review): the text above says both eFGpct and FGpct are removed, but
# select(!eFGpct, FGpct) only drops eFGpct — the second argument simply
# re-selects FGpct, which is already kept. FGpct survives here and is in
# fact removed later (in the field-goal section below); "fixing" this call
# to !c(eFGpct, FGpct) would break that later select, so it is left as-is
# and merely flagged.
all <- all %>%
dplyr::select(!eFGpct, FGpct)
all_X2 <- melt(all, id.vars = "salary", measure.vars = X2_point)
ggplot(all_X2, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_X3 <- melt(all, id.vars = "salary", measure.vars = X3_point)
ggplot(all_X3, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_ft <- melt(all, id.vars = "salary", measure.vars = Free_throw)
ggplot(all_ft, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_rb <- melt(all, id.vars = "salary", measure.vars = rebounding)
ggplot(all_rb, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_pm <- melt(all, id.vars = "salary", measure.vars = playmaking)
ggplot(all_pm, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_df <- melt(all, id.vars = "salary", measure.vars = defence)
ggplot(all_df, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
all_ovAd <- melt(all, id.vars = "salary", measure.vars = overall_adstats)
ggplot(all_ovAd, aes(x = value, y = salary)) + geom_point() + geom_smooth(formula = y ~
x, method = glm) + facet_wrap(vars(variable), scales = "free")
To reduce complexity of the model, I will combine and delete some
variables. A high complexity model will result in overfitting.
Before this, I will save a copy of original data
# Save a copy of the cleaned data (without the read.csv index column).
write.csv(all[, !names(all) %in% c("X")], "dataset/cleaned_data.csv")
# Keep the player identifiers aside, drop them from the modelling table,
# then clear everything else from the workspace.
# BUG FIX: the original ran rm(list = ls()) *before* subsetting `all`,
# which deletes `all` itself and breaks the next two lines (it only
# appeared to work because of chunk caching). Subset first, then remove
# only what is no longer needed.
players <- all[, c("player_id", "name")]
all <- dplyr::select(all, !c(player_id, name))
rm(list = setdiff(ls(), c("all", "players")))
cor(as.numeric(fct_reorder(as.factor(all$team_2021), all$salary, median)), all$salary)
## [1] 0.2288759
cor(as.numeric(fct_reorder(as.factor(all$team_2022), all$salary, median)), all$salary)
## [1] 0.1704062
There is no clear correlation between salary and team as each team will have varying salary for their star players and bench players. I will remove this variable.
all <- dplyr::select(all, !c(team_2021, team_2022))
# Quick permutation-importance screen with a small random forest.
# NOTE(review): there is no set.seed() before this call, so the
# importance ranking is not reproducible run-to-run.
quick_rf <- randomForest(salary ~ ., data = all, ntree = 100, importance = TRUE)
imp_rf <- importance(quick_rf)
# Column 1 is %IncMSE: the percentage increase in out-of-bag MSE when the
# variable is randomly permuted.
imp_df <- data.frame(Variables = row.names(imp_rf), MSE = imp_rf[, 1])
imp_df <- imp_df[order(imp_df$MSE, decreasing = TRUE), ]
# Plot the 20 most important predictors.
ggplot(imp_df[1:20, ], aes(x = reorder(Variables, MSE), y = MSE, fill = MSE)) + geom_bar(stat = "identity") +
labs(x = "Variables", y = "% increase MSE if variable is randomly permuted") +
coord_flip() + theme(legend.position = "none")
There are steal, block and offensive rebounds per game record but their correlation is not very strong. I will combine them into possession gain to make a stronger variable as these action will all result in a possession gain for the team.
cor(all$STL, all$salary)
## [1] 0.460643
cor(all$BLK, all$salary)
## [1] 0.2902457
cor(all$ORB, all$salary)
## [1] 0.1674434
Removing steal, block and offensive rebounds per game and add possession gain.
# Combine the actions that win a possession for the team (steals, blocks,
# offensive rebounds) into a single predictor, then drop the parts.
all$possGain <- all$STL + all$BLK + all$ORB
all <- dplyr::select(all, !c(STL, BLK, ORB))
I will combine turn-overs and personal fouls to become possession. These variables have a low importance in the random forest. These actions will result in an possession lost.
imp_df$MSE[imp_df$Variables %in% c("TOV", "PF")]
## [1] 2.479403 1.989752
# Combine the actions that cost a possession (turnovers, personal fouls)
# into a single predictor, then drop the parts.
all$possLost <- all$TOV + all$PF
all <- dplyr::select(all, !c(TOV, PF))
I will remove anything about two pointer as it is just portion of field goal that is not three pointers.
all <- all %>%
dplyr::select(!c(X2P, X2PA, X2Ppct))
I will remove free throw, field goal and 3 point percentage and attempts. I will replace them by free throw, field goal and 3 three point missed.
# Replace attempts and percentages with misses (attempts minus makes) for
# field goals, threes, and free throws.
all$FGM <- all$FGA - all$FG
all$X3M <- all$X3PA - all$X3P
all$FTM <- all$FTA - all$FT
all <- dplyr::select(all, !c(FGA, FGpct, X3PA, X3Ppct, FTA, FTpct))
I will remove game started and replace it with starter, defined as whether the player started more than 50% of the games they played. I will also remove games played, as it is directly related to minutes played while minutes played has a stronger correlation and importance. I will also change minutes played per game to total minutes played.
# starter = 1 when the player started at least half of his games; MP is
# converted from per-game minutes to total minutes for the season. Both
# Game_started and Game_played are then dropped.
all$starter <- ifelse(all$Game_started / all$Game_played >= 0.5, 1, 0)
all$MP <- all$MP * all$Game_played
all <- dplyr::select(all, !c(Game_started, Game_played))
all$starter <- as.factor(all$starter)
I will remove total rebound as it is the sum of offensive and defensive rebound while defensive rebound has stronger correlation and importance.
all <- dplyr::select(all, !TRB)
I will remove Box Plus or minus as it is the sum of offensive and defensive box score.
cor(all[, c("OBPM", "DBPM", "BPM", "salary")])[, "salary"]
## OBPM DBPM BPM salary
## 0.6356575 0.1026086 0.5481368 1.0000000
imp_df[imp_df$Variables %in% c("OBPM", "DBPM", "BPM"), ]
## Variables MSE
## BPM BPM 3.512624
## OBPM OBPM 2.176019
## DBPM DBPM 2.127025
cor(all$OBPM + all$DBPM, all$BPM)
## [1] 0.9998765
all$BPM <- NULL
I will remove them as they are directly related to field goals and field goals missed.
all$FTr <- NULL
all$X3PAr <- NULL
As I am not sure about the effect of the variables on the model, I will not remove any extra variable but look at the results first
# Clear everything except the working data and the player lookup table,
# then split columns into numeric predictors, factor predictors and the
# response / identifier columns.
rm(list = setdiff(ls(), c("all", "players")))
numVar <- names(all)[sapply(all, is.numeric)]
salary <- all$salary
player_name <- all$name
numVar <- setdiff(numVar, c("salary", "X", "name"))
all_num <- all[, numVar]
all_fac <- all[, setdiff(names(all), c(numVar, "salary", "name", "X"))]
There are 30 numeric predictors and 2 factor predictors.
# Per-column preprocessing:
#  * columns with any non-positive value are min-max scaled to [0, 1] first
#    (so the log transform below is well defined);
#  * right-skewed columns (skew > 0.8) get a log(x + 1) transform;
#  * left-skewed columns (skew < -0.8) are squared.
# Column indices are collected during the loop and mapped to names after.
log_names <- c()
minMax <- c()
sq_names <- c()
for (i in 1:ncol(all_num)) {
if (any(all_num[, i] <= 0)) {
process <- preProcess(as.data.frame(all_num[, i]), method = "range")
all_num[, i] <- predict(process, as.data.frame(all_num[, i]))
minMax <- c(minMax, i)
}
# Skewness is evaluated after the (linear) range scaling; a linear rescale
# does not change skew, so the order is safe.
if (skew(all_num[, i]) > 0.8) {
all_num[, i] <- log(all_num[, i] + 1)
log_names <- c(log_names, i)
} else if (skew(all_num[, i]) < -0.8) {
all_num[, i] <- all_num[, i]^2
sq_names <- c(sq_names, i)
}
}
# Convert stored indices into column names for reporting below.
log_names <- names(all_num)[log_names]
minMax_names <- names(all_num)[minMax]
sq_names <- names(all_num)[sq_names]
These columns have been log(x + 1) transformed due to their skewness.
(The + 1 prevents taking the log of 0, which would result in -Inf.)
# Standardize (z-score) every column that was not log-transformed; the
# logged columns are left on their log scale.
non_log <- setdiff(names(all_num), log_names)
all_num[non_log] <- as.data.frame(scale(all_num[non_log]))
I will convert the remaining factor variables into numeric dummy variables (as required by many machine learning algorithms).
# One-hot encode the factor predictors (no intercept column), then inspect
# the distribution of the raw salary response.
all_fac <- as.data.frame(model.matrix(~. - 1, as.data.frame(all_fac)))
skew(salary)
## [1] 1.564389
qqnorm(salary)
qqline(salary)
Salary is too right skewed and not normally distributed.
# Log-transform salary; skew drops to near zero and the Q-Q plot improves.
salary <- log(salary)
skew(salary)
## [1] 0.04653786
qqnorm(salary)
qqline(salary)
# Sorted salaries reveal a low-end cluster below log(salary) = 14.
plot(x = 1:length(salary), y = sort(salary))
abline(h = 14)
I will remove the outliers with log(salary) < 14.
# Assemble the modelling frame, drop the low-salary outliers
# (log(salary) < 14), then split roughly 80/20 into train/test.
# NOTE(review): sample() is not seeded, so the split (and every model
# result below) is not reproducible run-to-run — consider set.seed().
alldata <- cbind(X = 1:nrow(all), salary = salary, all_num, all_fac)
row.names(alldata) <- players$name
name <- players$name[alldata$salary > 14]
alldata <- alldata[alldata$salary > 14, ]
inTrain <- sample(1:2, size = nrow(alldata), prob = c(0.8, 0.2), replace = TRUE)
train <- alldata[inTrain == 1, ]
test <- alldata[inTrain == 2, ]
write.csv(alldata[, -c(1, 2)], "dataset/normalized_data.csv")
In the modelling I will use the cross validation root mean squared error to determine which model to choose.
# Baseline OLS fit (row-id X excluded) with the four standard diagnostic
# plots; the horizontal line marks the common 4/n Cook's distance cutoff.
mod_lm <- lm(salary ~ . - X, data = train)
plot(mod_lm, which = 1)
plot(mod_lm, which = 2)
plot(mod_lm, which = 3)
plot(mod_lm, which = 4)
abline(h = 4/nrow(train))
# Most influential observations by Cook's distance, then the players whose
# distance exceeds the 4/n rule of thumb.
sort(cooks.distance(mod_lm), decreasing = TRUE)[1:3]
## Jaden Springer Desmond Bane Devin Cannady
## 0.42218052 0.04500047 0.04449516
players[players$name %in% names(which(cooks.distance(mod_lm) > 4/nrow(train))), ]
## player_id name
## 43 edwarca01 Carsen Edwards
## 91 banede01 Desmond Bane
## 93 cannade01 Devin Cannady
## 114 kaminfr01 Frank Kaminsky
## 150 sprinja01 Jaden Springer
## 166 culveja01 Jarrett Culver
## 180 harrijo01 Joe Harris
## 189 poolejo01 Jordan Poole
## 220 loveke01 Kevin Love
## 240 jamesle01 LeBron James
## 264 portemi01 Michael Porter Jr.
## 308 tuckera01 Rayjon Tucker
## 322 westbru01 Russell Westbrook
## 324 beysa01 Saddiq Bey
## 344 taylote01 Terry Taylor
## 365 halibty01 Tyrese Haliburton
## 367 jonesty01 Tyus Jones
summary(mod_lm)
##
## Call:
## lm(formula = salary ~ . - X, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82226 -0.31639 0.05619 0.35760 1.27033
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.483750 1.394815 11.818 < 2e-16 ***
## Age 0.307495 0.037006 8.309 5.45e-15 ***
## MP -0.328533 0.100108 -3.282 0.00117 **
## FG 1.411302 1.044245 1.352 0.17771
## X3P -0.162821 0.181761 -0.896 0.37119
## FT -0.317065 1.277467 -0.248 0.80418
## DRB 2.461169 1.768175 1.392 0.16514
## AST 3.371172 1.418029 2.377 0.01816 *
## PTS -1.402118 0.895750 -1.565 0.11873
## PER 0.373310 0.677644 0.551 0.58218
## TSpct -2.156322 2.921740 -0.738 0.46117
## ORBpct 2.857177 3.851514 0.742 0.45886
## DRBpct 6.086728 6.086241 1.000 0.31821
## TRBpct -11.152085 9.186810 -1.214 0.22588
## ASTpct -3.406084 1.245786 -2.734 0.00669 **
## STLpct -0.259637 0.752566 -0.345 0.73037
## BLKpct 0.398177 1.442842 0.276 0.78279
## TOVpct 0.989632 1.092366 0.906 0.36580
## USGpct 0.034019 0.155156 0.219 0.82662
## OWS 2.474102 1.415557 1.748 0.08168 .
## DWS 0.230342 0.102941 2.238 0.02610 *
## OBPM 0.137806 0.139308 0.989 0.32348
## DBPM -0.044575 0.090066 -0.495 0.62108
## VORP -1.675764 1.884377 -0.889 0.37467
## Weight -0.040123 0.059448 -0.675 0.50032
## Height 0.134180 0.071508 1.876 0.06172 .
## possGain 0.411751 1.275648 0.323 0.74712
## possLost -0.001725 0.107122 -0.016 0.98717
## FGM 0.181024 0.270827 0.668 0.50447
## X3M 0.333070 0.161365 2.064 0.04001 *
## FTM 1.307387 0.587963 2.224 0.02704 *
## PositionC -0.220577 0.196897 -1.120 0.26364
## PositionPF -0.135904 0.143901 -0.944 0.34583
## PositionPG -0.207653 0.124694 -1.665 0.09706 .
## PositionSF -0.214658 0.115568 -1.857 0.06439 .
## PositionSG NA NA NA NA
## starter1 0.206174 0.108258 1.904 0.05796 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5332 on 259 degrees of freedom
## Multiple R-squared: 0.7608, Adjusted R-squared: 0.7285
## F-statistic: 23.54 on 35 and 259 DF, p-value: < 2.2e-16
The r squared of the model is 0.7608406
# In-sample vs held-out RMSE for the OLS model.
rmse(train$salary, mod_lm$fitted.values)
## [1] 0.4996236
rmse(test$salary, predict(object = mod_lm, newdata = test))
## Warning in predict.lm(object = mod_lm, newdata = test): prediction from a rank-
## deficient fit may be misleading
## [1] 0.5427067
The train root mean squared error is 0.4996236 while the testing (out of bag) root mean squared error is 0.5427067.
# 10-fold cross-validated lasso (alpha = 1) searching lambda over a fine
# grid; the row-id column X is excluded from the predictors.
train_control <- trainControl(method = "cv", number = 10)
param_grid <- expand.grid(alpha = 1, lambda = seq(from = 0.001, to = 0.1, by = 0.0005))
mod_lasso <- train(
  salary ~ . - 1 - X,
  data = train,
  method = "glmnet",
  trControl = train_control,
  tuneGrid = param_grid
)
The coefficient of the final model:
coef(mod_lasso$finalModel, mod_lasso$bestTune$lambda)
## 37 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 14.573324574
## Age 0.274923153
## MP .
## FG 0.251616037
## X3P .
## FT .
## DRB 0.722242865
## AST .
## PTS .
## PER .
## TSpct .
## ORBpct .
## DRBpct .
## TRBpct .
## ASTpct -0.056943563
## STLpct -0.125974032
## BLKpct 0.734491701
## TOVpct .
## USGpct .
## OWS .
## DWS 0.004585566
## OBPM .
## DBPM .
## VORP 1.265594873
## Weight .
## Height 0.031495022
## possGain 0.186785359
## possLost .
## FGM 0.242091595
## X3M 0.107999131
## FTM 0.621138375
## PositionC .
## PositionPF 0.031052788
## PositionPG .
## PositionSF .
## PositionSG 0.072862116
## starter1 0.337220128
The r squared of the best tuned model is 0.7117542.
# In-sample vs held-out RMSE for the lasso model.
rmse(predict(mod_lasso), train$salary)
## [1] 0.5268912
rmse(predict(mod_lasso, test), test$salary)
## [1] 0.5257583
The train root mean squared error is 0.5268912 while the testing (out of bag) root mean squared error is 0.5257583.
# Elastic net: caret picks 10 candidate values for both alpha and lambda,
# each combination evaluated with 10-fold cross validation.
elaNet_control <- trainControl(method = "cv", number = 10)
mod_elaNet <- train(salary ~ . - X, data = train, method = "glmnet",
                    tuneLength = 10, trControl = elaNet_control)
# Best cross-validated R squared across the tuning grid.
max(mod_elaNet$results$Rsquared)
## [1] 0.717909
The R squared is 0.717909
# In-sample vs held-out RMSE for the elastic net.
rmse(predict(mod_elaNet), train$salary)
## [1] 0.5133823
rmse(predict(mod_elaNet, test), test$salary)
## [1] 0.5344562
The train root mean squared error is 0.5133823 while the testing (out of bag) root mean squared error is 0.5344562.
# Bidirectional stepwise selection by AIC, starting from the full OLS fit.
mod_stepAIC <- stepAIC(
  mod_lm,
  scope = list(upper = ~., lower = ~1),
  direction = "both",
  trace = FALSE
)
mod_stepAIC$coefficients
## (Intercept) Age MP FG DRB AST
## 15.6844715 0.2904833 -0.2329144 1.4273136 2.1189764 2.6561159
## PTS TRBpct ASTpct BLKpct OWS DWS
## -1.2030373 -1.9317407 -2.3767727 1.0820445 1.4693787 0.1176143
## Height FGM X3M FTM starter1 PositionSG
## 0.1077515 0.2414456 0.1694718 1.2876026 0.1944891 0.1708528
# In-sample vs held-out RMSE for the stepwise-selected model.
rmse(mod_stepAIC$fitted.values, train$salary)
## [1] 0.5072018
rmse(predict(mod_stepAIC, test), test$salary)
## [1] 0.5224379
The train root mean squared error is 0.5072018 while the testing (out of bag) root mean squared error is 0.5224379.
# CART regression tree (row-id column dropped); printcp() both prints and
# returns the complexity-parameter table, captured for the R² computation.
mod_dt <- rpart(salary ~ ., data = train[, -1])
tab <- printcp(mod_dt)
##
## Regression tree:
## rpart(formula = salary ~ ., data = train[, -1])
##
## Variables actually used in tree construction:
## [1] Age AST DRB FT PTS VORP
##
## Root node error: 307.91/295 = 1.0438
##
## n= 295
##
## CP nsplit rel error xerror xstd
## 1 0.489254 0 1.00000 1.00545 0.052413
## 2 0.074139 1 0.51075 0.58534 0.045615
## 3 0.071423 2 0.43661 0.56051 0.043200
## 4 0.043944 3 0.36518 0.49193 0.041213
## 5 0.036267 4 0.32124 0.44736 0.039515
## 6 0.020346 5 0.28497 0.40712 0.037060
## 7 0.015774 6 0.26463 0.36281 0.035747
## 8 0.014477 7 0.24885 0.36702 0.035933
## 9 0.010000 8 0.23438 0.35642 0.035354
# rel error / xerror are fractions of root-node variance, so 1 - value gives
# apparent and cross-validated R squared per pruning step.
rsq <- 1 - tab[, c(3, 4)]
rsq
## rel error xerror
## 1 0.0000000 -0.005452432
## 2 0.4892545 0.414664676
## 3 0.5633933 0.439494159
## 4 0.6348161 0.508074789
## 5 0.6787600 0.552642416
## 6 0.7150270 0.592875452
## 7 0.7353729 0.637187776
## 8 0.7511464 0.632975919
## 9 0.7656234 0.643582720
The training r squared increases from 0 to 0.7656234 as the number of splits grows, while the cross-validated r squared peaks at 0.6435827.
# In-sample vs held-out RMSE for the regression tree.
rmse(train$salary, predict(mod_dt))
## [1] 0.4946026
pred <- predict(mod_dt, test)
rmse(test$salary, pred)
## [1] 0.5850177
The train root mean squared error is 0.4946026 while the testing (out of bag) root mean squared error is 0.5850177.
# Random forest with 5000 trees, each capped at 100 terminal nodes.
# NOTE(review): sqrt(p) is the classification default for mtry; regression
# defaults to p/3. The value below is computed but never passed to the model.
mtry <- sqrt(ncol(train))
# NOTE(review): `ntree` is not a caret tuning parameter for method = "rf"
# (only mtry is tunable), so this grid could never have been used by caret.
tunegrid <- expand.grid(.mtry = mtry, ntree = c(100, 500, 1000, 1500, 2000, 5000))
# BUG FIX: `trControl` and `tuneGrid` are caret::train() arguments;
# randomForest() silently swallowed them through `...`, so they had no
# effect. They are removed here — the fitted model is unchanged.
mod_rf <- randomForest(salary ~ ., data = train[, -c(1)], ntree = 5000, maxnodes = 100)
# In-sample vs held-out RMSE for the random forest; the large gap between
# the two indicates the forest fits the training data very closely.
pred_rf_tr <- predict(mod_rf, train)
rmse(pred_rf_tr, train$salary)
## [1] 0.2329279
pred_rf <- predict(mod_rf, test)
rmse(pred_rf, test$salary)
## [1] 0.5003965
The train root mean squared error is 0.2329279 while the testing (out of bag) root mean squared error is 0.5003965.
# Random forest tuned through caret (mtry selected by 10-fold CV).
mod_rft <- train(salary ~ ., train[, -c(1)], method = "rf", trControl = trainControl(method = "cv",
number = 10))
rmse(predict(mod_rft, train), train$salary)
## [1] 0.2325683
rmse(predict(mod_rft, test), test$salary)
## [1] 0.4834867
The train root mean squared error is 0.2325683 while the testing (out of bag) root mean squared error is 0.4834867.
# PCA on the cleaned data (row-id column dropped), with each variable
# scaled to unit variance; pca.var.per is the proportion of total variance
# explained by each component.
all_pca <- as.matrix(alldata[, names(alldata) != "X"])
rownames(all_pca) <- name
pca <- prcomp(all_pca, scale = TRUE)
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var / sum(pca.var), 5)
# Scree plot: proportion of variance explained by each principal component.
# BUG FIX: aes() previously mapped y to the *global* vector pca.var.per
# instead of the data-frame column; map the column ("value") explicitly.
scree_df <- data.frame(value = pca.var.per, index = seq_along(pca.var.per))
ggplot(scree_df, aes(x = index, y = value)) +
  geom_bar(stat = "identity") +
  theme_minimal()
# Project the players onto the first two principal components.
pca.data <- data.frame(Sample = rownames(pca$x), X = pca$x[, 1], Y = pca$x[, 2],
    id = 1:nrow(pca$x))
ggplot(pca.data, aes(x = X, y = Y, label = id)) + geom_text() + xlab(paste("PC1 - ",
    pca.var.per[1], "%", sep = "")) + ylab(paste("PC2 - ", pca.var.per[2], "%", sep = "")) +
    theme_minimal() + ggtitle("My PCA Graph")
# Loading scores of PC1; magnitudes rank each variable's contribution.
loading_score <- pca$rotation[, 1]
# NOTE(review): gene_scores is never used later in the document — appears to
# be a leftover from a tutorial; kept for output compatibility.
gene_scores <- abs(loading_score)
I will plot out the principal components that each contain more than 5% of the information in the original dataset.
# Plot loading scores for every principal component that explains at least
# 5% of the total variance (components are in decreasing variance order, so
# we can stop at the first one below the threshold).
# BUG FIXES: (1) the bar fill previously used PC1's loading signs
# (loading_score) for every component — it must use the current component's
# signs; (2) the manual colors were supplied via scale_color_manual although
# the bars map the *fill* aesthetic, so the scale was never applied — use
# scale_fill_manual with a named character vector. The unused `count`
# variable is removed.
for (i in seq_along(pca.var.per)) {
    if (pca.var.per[i] < 0.05) {
        break
    }
    pc_df <- data.frame(score = pca$rotation[, i], var = names(pca$rotation[, i]))
    print(ggplot(pc_df, aes(x = reorder(var, abs(score)), y = abs(score),
        fill = ifelse(score > 0, "positive", "negative"))) +
        geom_bar(stat = "identity") + theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7)) +
        scale_fill_manual(values = c(positive = "blue", negative = "red")) +
        labs(title = paste0("Principle Component ", as.character(i), " (", as.character(pca.var.per[i] *
            100), "%)"), x = "variable", y = "proportion"))
}
# Correlate the original numeric variables with the first four principal
# components and keep the variables with at least one |correlation| > 0.5.
pcs <- c("PC1", "PC2", "PC3", "PC4")
cor_Mat <- cor(data.frame(alldata[, numVar], pca$x[, 1:4]))
cor_high <- names(which(rowSums(abs(cor_Mat[, pcs]) > 0.5) > 0))
cor_high <- setdiff(cor_high, pcs)
# Suppress scientific notation in the printed table.
options(scipen = 100)
round(cor_Mat[cor_high, pcs], digits = 2)
## PC1 PC2 PC3 PC4
## MP -0.78 0.16 0.15 -0.18
## FG -0.93 0.13 0.17 0.08
## X3P -0.55 0.61 0.30 -0.14
## FT -0.86 0.00 0.05 0.18
## DRB -0.79 -0.42 0.08 0.10
## AST -0.77 0.38 -0.39 0.13
## PTS -0.92 0.19 0.19 0.07
## PER -0.68 -0.47 -0.13 -0.13
## TSpct -0.18 -0.48 0.03 -0.55
## ORBpct 0.12 -0.88 -0.04 0.13
## DRBpct -0.25 -0.81 0.01 0.21
## TRBpct -0.12 -0.92 -0.01 0.19
## ASTpct -0.58 0.40 -0.56 0.22
## STLpct -0.04 0.11 -0.59 -0.11
## BLKpct 0.04 -0.71 -0.15 -0.07
## TOVpct -0.02 -0.25 -0.57 0.31
## USGpct -0.68 0.20 0.06 0.45
## OWS -0.70 -0.25 0.03 -0.43
## DWS -0.72 -0.22 -0.08 -0.26
## OBPM -0.79 -0.12 -0.02 -0.30
## DBPM -0.15 -0.40 -0.55 -0.47
## VORP -0.83 -0.18 -0.15 -0.30
## Weight -0.09 -0.77 0.28 0.14
## Height -0.02 -0.80 0.35 0.16
## possGain -0.55 -0.64 -0.12 -0.01
## possLost -0.85 -0.03 -0.10 0.27
## FGM -0.83 0.44 0.16 0.20
## X3M -0.56 0.64 0.27 0.00
## FTM -0.66 -0.34 -0.02 0.33
corrplot(cor_Mat[c("PC1", "PC2", "PC3", "PC4"), cor_high], tl.pos = "lt", method = "number")
# Single-architecture grid for the neural network: three hidden layers of
# 100, 75 and 50 units, evaluated with 10-fold cross validation.
tunegrid_neural <- data.frame(layer1 = 100, layer2 = 75, layer3 = 50)
mod_neur <- train(salary ~ . - X, data = train, method = "neuralnet", tuneGrid = tunegrid_neural,
    trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE), linear.output = TRUE)
# Cross-validated RMSE and R squared of the neural network, then held-out
# RMSE.
# NOTE(review): predicting with mod_neur$finalModel on the raw test frame
# bypasses caret's formula handling (the salary and X columns are still in
# `test`); `predict(mod_neur, test)` would be the safer call — confirm the
# two agree.
mod_neur$results$RMSE
0.6170212 0.6496249
max(mod_neur$results$Rsquared)
## [1] 0.6624476
mod_neur$results$Rsquared
## [1] 0.6624476
rmse(test$salary, predict(mod_neur$finalModel, test))
## [1] 0.6328699
The cross-validated root mean squared error of the neural network is 0.6170212 while the testing (out of bag) root mean squared error is 0.6328699.